This dataset contains 113,937 loans described by 81 variables. The data was downloaded from the Udacity data archive through this link: https://www.google.com/url?q=https://s3.amazonaws.com/udacity-hosted-downloads/ud651/prosperLoanData.csv&sa=D&ust=1581581520570000
# I want to import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sb
%matplotlib inline
import warnings
warnings.simplefilter("ignore")
# Load the Prosper loan dataset into `data`
data = pd.read_csv('data/prosperLoanData.csv')
# Set the maximum number of displayed columns to the number of columns in the dataset,
# so every column is visible for visual assessment
pd.set_option('display.max_columns', data.shape[1])
# Display the first five rows of the data
data.head(5)
# Check the shape of the data (rows = loans, columns = fields)
print('Shape:- \nThe number of loans recorded is', data.shape[0])
print('The number of fields in the dataset is', data.shape[1], end = '\n\nDimension:- \n')
# Check the dimension of the data
print(f'The dataset is in {data.ndim} dimension')
# Check columns in the dataset.
columns = data.columns.to_list()
columns
# Build a one-column frame of each column's dtype
datatype = data.dtypes.to_frame()
datatype.columns = ['dtype']
# store the dtypes as strings so they can be counted/plotted as labels below
datatype['dtype'] = datatype['dtype'].astype('category')
datatype['dtype'] = datatype['dtype'].apply(lambda x: str(x))
datatype.T
Relative proportions of each datatype
# Visualise the relative proportions of each datatype to other datatypes
# Calculate the relative proportions
no_of_obsevation = datatype['dtype'].shape[0]
# share of the most common dtype — used as the upper end of the tick range
max_proportion = datatype['dtype'].value_counts().to_list()[0]/no_of_obsevation
tick_proportions = np.arange(0, max_proportion, 0.03)
# generator of formatted tick labels; consumed exactly once by plt.yticks below
tick_names = ('{:0.2f}'.format(v) for v in tick_proportions)
# bar chart of dtype counts, most frequent first
plt.figure(figsize =(6,6))
order = datatype['dtype'].value_counts().index.to_list()
sb.countplot(x = "dtype" , data =datatype, order = order, color =sb.color_palette()[7],)
plt.xlabel('data types', fontdict = {'weight': 'bold'})
plt.ylabel('count of data types', fontdict = {'weight': 'bold'})
# y ticks positioned at raw counts but labelled as proportions
plt.yticks(tick_proportions * no_of_obsevation, tick_names);
# Calculate and display the percentage of each datatype above each bar
for i in range(datatype['dtype'].value_counts().shape[0]):
    count = datatype['dtype'].value_counts().to_list()[i]
    pct_string = '{:0.1f}%'.format(100*(count/no_of_obsevation))
    plt.text(i,count+1, pct_string, ha = 'center');
# make a copy of original data to clean
data_copy = data.copy()
# From the data description CreditGrade is for loans before 2009 and ProsperRating (Alpha) is
# for loans from 2009 — each column's gaps are filled from the other so both hold a merged rating.
# NOTE(review): 'ProsperRating (Alpha)' is filled first, so the CreditGrade fill below draws on
# the already-merged ratings — confirm this ordering is intended.
data_copy['ProsperRating (Alpha)'] = data_copy[['CreditGrade', 'ProsperRating (Alpha)']]\
['ProsperRating (Alpha)'].fillna(data_copy['CreditGrade'])
data_copy['CreditGrade'] = data_copy[['CreditGrade', 'ProsperRating (Alpha)']]['CreditGrade']\
.fillna(data_copy['ProsperRating (Alpha)'])
#test
data_copy[['CreditGrade', 'ProsperRating (Alpha)']].head(5)
Redefining categorical data type into ordinal and nominal
# Extract the quarter label from the LoanOriginationQuarter column (e.g. 'Q1 2009' -> 'Q1')
data_copy['LoanOriginationQuarter'] = data_copy['LoanOriginationQuarter'].apply(lambda x: x.split(' ')[0])
# test
data_copy['LoanOriginationQuarter'][1]
# Define the ordinal and nominal data
# ordinal categorical: each list gives the category order from lowest to highest
# NOTE(review): 'Not employed' and 'Not displayed' rank above '$100,000+' in IncomeRange —
# confirm this ordering is intended
ordinal_categorical = {'ProsperRating (Alpha)':['NC','HR', 'E', 'D', 'C', 'B', 'A', 'AA'],
                       'CreditGrade': ['NC', 'HR', 'E', 'D', 'C', 'B', 'A', 'AA'],
                       'IncomeRange': ['$0', '$1 - 24,999', '$25,000 - 49,999', '$50,000 - 74,999',\
                                       '$75,000 - 99,999', '$100,000+', 'Not employed', 'Not displayed'],
                       'LoanOriginationQuarter': ['Q1', 'Q2', 'Q3', 'Q4']
                      }
#nominal categorical
nominal_categorical = ['EmploymentStatus','LoanStatus', 'BorrowerState', 'Occupation']
# cast ordinal columns to ordered categoricals; values outside the declared categories become NaN
for var in ordinal_categorical:
    ordered_var = pd.api.types.CategoricalDtype(ordered = True,
                                                categories = ordinal_categorical[var])
    data_copy[var] = data_copy[var].astype(ordered_var)
for var in nominal_categorical:
    data_copy[var] = data_copy[var].astype('category')
# test
print(data_copy[ordinal_categorical].dtypes, end = '\n\nnominal data \n')
print(data_copy[nominal_categorical].dtypes)
# keep only the columns needed in the analysis
data_copy = data_copy[['LoanStatus', 'LenderYield', 'ProsperRating (Alpha)', 'ListingCategory (numeric)', \
                       'BorrowerState', 'Occupation', 'EmploymentStatus', 'IsBorrowerHomeowner', \
                       'CurrentlyInGroup', 'CreditScoreRangeLower', 'CreditScoreRangeUpper', \
                       'TotalCreditLinespast7years', 'DebtToIncomeRatio', 'IncomeRange', \
                       'IncomeVerifiable', 'LoanOriginationQuarter', 'MonthlyLoanPayment', \
                       'Recommendations', 'InvestmentFromFriendsCount', 'InvestmentFromFriendsAmount',\
                       'Investors', 'PercentFunded']]
# test
print(data_copy.columns.to_list())
data_copy.head()
# Check for duplicate observations
# NOTE(review): duplicates are counted on the original `data`, not `data_copy` — confirm intended
data.duplicated().sum()
# Check for null values
data_copy.isnull().sum()
# Filter out the names of columns with missing data
columns_with_missing_data = data_copy.columns[data_copy.isnull().sum() >0]
columns_with_missing_data
Visualizing missing data
# Visualize how much data is missing in each column: a horizontal bar per affected
# column, annotated with the percentage of rows missing.
counts_of_missing_value = data_copy[columns_with_missing_data].isnull().sum()
percentage = counts_of_missing_value/data_copy.shape[0] * 100
sb.barplot(y = counts_of_missing_value.index, x =counts_of_missing_value, color = 'blue')
# annotate each bar; use .iloc — the Series is labelled by column name, so plain [i]
# relied on pandas' deprecated integer-label fallback
for i in range(percentage.shape[0]):
    plt.text(x = counts_of_missing_value.iloc[i], y = i, s = '{:0.2f}%'.format(percentage.iloc[i]))
# label fix: the x-axis shows raw counts (percentages appear as bar annotations),
# and the y-axis lists the affected columns — the original labels said otherwise
plt.xlabel('count of missing values', fontdict = {'weight': 'bold'})
plt.ylabel('columns with missing values', fontdict = {'weight': 'bold'});
# In order not to lose too much data, check observations that have missing values in
# more than one feature
def both_missing(column1, column2, df=None):
    """Count rows that are null in *both* given columns at once.

    column1, column2: str, column names to check.
    df: optional DataFrame to check; defaults to the module-level
        ``data_copy`` so all existing calls keep working unchanged.
    Returns: int, the number of rows null in both columns.
    """
    frame = data_copy if df is None else df
    return len(frame[frame[column1].isna() & frame[column2].isna()])
# How much do the missing columns overlap pairwise?
print(both_missing('ProsperRating (Alpha)', 'DebtToIncomeRatio'))
print(both_missing('DebtToIncomeRatio', 'BorrowerState'))
print(both_missing('ProsperRating (Alpha)', 'BorrowerState'))
data_copy[columns_with_missing_data.to_list()].isna().sum()
print(both_missing('ProsperRating (Alpha)', 'DebtToIncomeRatio'))
print(both_missing('EmploymentStatus', 'Occupation'))
# since 'Occupation' is missing for every missing 'EmploymentStatus' value, dropping rows
# with missing 'EmploymentStatus' also removes those 'Occupation' nulls
data_copy = data_copy.dropna(subset = ['EmploymentStatus'])
#test
data_copy['EmploymentStatus'].isna().sum()
data_copy[columns_with_missing_data.to_list()].isna().sum()
print(both_missing('ProsperRating (Alpha)', 'TotalCreditLinespast7years'))
print(both_missing('TotalCreditLinespast7years', 'DebtToIncomeRatio'))
print(both_missing('ProsperRating (Alpha)', 'DebtToIncomeRatio'))
# drop null TotalCreditLinespast7years rows, which also removes the overlapping
# DebtToIncomeRatio nulls counted above
data_copy = data_copy.dropna(subset = ['TotalCreditLinespast7years'])
#test
data_copy['TotalCreditLinespast7years'].isna().sum()
print(both_missing('ProsperRating (Alpha)', 'DebtToIncomeRatio'))
# drop null ProsperRating (Alpha) rows, which also removes the overlapping
# DebtToIncomeRatio nulls
data_copy = data_copy.dropna(subset = ['ProsperRating (Alpha)'])
#test
data_copy['ProsperRating (Alpha)'].isna().sum()
data_copy[columns_with_missing_data.to_list()].isna().sum()
# check the percentage of data remaining after dropping some observations
print('percentage of data left', (data_copy.shape[0]/data.shape[0]) * 100)
# check how much percentage of data is still missing for each variable
def percentage_missing(column):
    """Print the share of rows still missing in ``column`` after cleaning,
    expressed relative to the size of the original (uncleaned) dataset."""
    missing_pct = data_copy[column].isna().sum() / data.shape[0] * 100
    print(f'percentage of data still missing in {column} is {missing_pct:0.2f}%')
# report the remaining missingness for each still-affected column
percentage_missing('BorrowerState')
percentage_missing('Occupation')
percentage_missing('DebtToIncomeRatio')
I want to check if the variables with missing data have missing data together in one observation
def index_generator(column_name, df=None):
    """Return the index labels of rows where ``column_name`` is null.

    column_name: str, the column to inspect.
    df: optional DataFrame to inspect; defaults to the module-level
        ``data_copy`` so existing calls keep working unchanged.
    Returns: list of index labels with a null in that column.
    """
    frame = data_copy if df is None else df
    return frame[column_name][frame[column_name].isna()].index.to_list()
# retrieve the indices of observations with null values in the 'BorrowerState' variable
null_state =index_generator('BorrowerState')
# retrieve the indices of observations with null values in the 'Occupation' variable
null_Occupation = index_generator('Occupation')
# retrieve the indices of observations with null values in the 'DebtToIncomeRatio' variable
null_income = index_generator('DebtToIncomeRatio')
# compare the index lists
def compare_lists(list1, list2):
    """Count how many items of each list also appear in the other.

    Returns a tuple: (number of list1 items found in list2,
                      number of list2 items found in list1).
    Duplicates are counted, matching the original list-comprehension
    behaviour; set lookups replace the original O(n*m) membership scans
    with O(n+m) without changing the counts (only membership is tested).
    """
    set1, set2 = set(list1), set(list2)
    in_both_1 = sum(1 for i in list1 if i in set2)
    in_both_2 = sum(1 for i in list2 if i in set1)
    return in_both_1, in_both_2
# compare null_state and null_Occupation to see if they share indices
compare_lists(null_state, null_Occupation)
# compare null_income and null_Occupation to see if they share indices
compare_lists(null_income, null_Occupation)
# compare null_income and null_state to see if they share indices
compare_lists(null_income, null_state)
# inspect the observations whose indices appear in both null_income and null_state
index = [i for i in null_income if i in null_state]
data.loc[index]
# check the percentage of data that will remain if I remove the observations with
# indices shared by null_state and null_income
((data_copy.shape[0] - len(index))/data.shape[0]) * 100
# since I will still have ~97.8% left, drop those observations
data_copy = data_copy.drop(index)
# test
data_copy[columns_with_missing_data.to_list()].isna().sum()
# Drop the 'IncomeRange' variable because it has 76.74% missing values
# NOTE(review): the 76.74% figure is quoted, not computed here — verify against the
# isna() output above
data_copy.drop(['IncomeRange'], axis =1, inplace = True)
# check amount of missing data left (IncomeRange no longer exists, so remove it from the list)
columns_with_missing_data = columns_with_missing_data.to_list()
columns_with_missing_data.remove('IncomeRange')
data_copy[columns_with_missing_data].isna().sum()
I want to interpolate the other missing data
# compare a few rows that have Occupation against rows that are missing it
data_copy[data_copy['Occupation'].notna()].sample(5)
data_copy[data_copy['Occupation'].isna()].sample(5)
I noticed that observations with missing Occupation data have an EmploymentStatus of Other, so I will fill them with 'Not Stated'
# 'Not Stated' must be registered as a category before it can be used as a fill value
data_copy['Occupation'] = data_copy['Occupation'].cat.add_categories('Not Stated')
data_copy['Occupation'].fillna('Not Stated', inplace = True)
data_copy['BorrowerState'] = data_copy['BorrowerState'].cat.add_categories('Not Stated')
data_copy['BorrowerState'].fillna('Not Stated', inplace = True)
# compare rows with and without DebtToIncomeRatio before interpolating
data_copy[data_copy['DebtToIncomeRatio'].notna()].sample(5)
data_copy[data_copy['DebtToIncomeRatio'].isna()].sample(5)
# perform linear interpolation on DebtToIncomeRatio
# NOTE(review): interpolate() fills by row position, so filled values depend on
# neighbouring rows' ratios — confirm this is acceptable for this column
data_copy['DebtToIncomeRatio'] = data_copy['DebtToIncomeRatio'].interpolate()
data_copy['DebtToIncomeRatio'].isna().sum()
# check for null values
data_copy.isna().sum()
# check percentage of data left after cleaning
print('percentage of data left after cleaning is {:0.02f}%'.format(data_copy.shape[0]/data.shape[0] * 100))
from the data description the column 'ListingCategory (numeric)' is encoded, I want to replace the values
# From the data dictionary, 'ListingCategory (numeric)' is an encoded field; replace
# the codes with their text labels.
# Fixes two label typos from the original pass:
#   'Not AVilable' -> 'Not Available', 'HOme Improvement' -> 'Home Improvement'
data_copy['ListingCategory (numeric)'] = data_copy['ListingCategory (numeric)'].replace(\
{0: 'Not Available', 1: 'Debt Consolidation', 2: 'Home Improvement',3: 'Business', 4: 'Personal Loan',\
5: 'Student Use', 6: 'Auto', 7: 'Other', 8: 'Baby & Adoption', 9: 'Boat', 10: 'Cosmetic Procedures',\
11: 'Engagement Ring', 12: 'Green Loans', 13: 'Household Expenses', 14: 'Large Purchases',\
15: 'Medical/ Dental', 16: 'Motorcycle', 17: 'RV', 18: 'Taxes', 19: 'Vacation', 20: 'Wedding Loans'})
data_copy['ListingCategory (numeric)'] = data_copy['ListingCategory (numeric)'].astype('category')
#test
print(data_copy['ListingCategory (numeric)'].dtype)
data_copy['ListingCategory (numeric)'].value_counts()
# create a column giving the number of investors who are not friends of the borrower
data_copy['investments not from friends'] = data_copy['Investors'] -\
data_copy['InvestmentFromFriendsCount']
# drop the now-misleading '(numeric)' suffix since the values are text labels
data_copy.rename(columns = {'ListingCategory (numeric)': 'ListingCategory'}, inplace = True)
data_copy.columns
data_copy.to_csv('data/cleaned_Loan_prosper_data.csv')
The dataset is a 2-dimension data with 113937 loan records and 81 features.
61.7% of the data is float, 21.0% is categorical, 13.6% is integer and 3.7% is boolean. What is/are the main feature(s) of interest in your dataset?
The main feature of interest is the
PercentFunded. I want to see the factors that will affect 100% full funding
I will check the features: LoanStatus, LenderYield, ProsperRating (Alpha), ListingCategory, BorrowerState, Occupation, EmploymentStatus, IsBorrowerHomeowner, CurrentlyInGroup, CreditScoreRangeLower, CreditScoreRangeUpper, TotalCreditLinespast7years, DebtToIncomeRatio, IncomeRange, IncomeVerifiable, LoanOriginationQuarter, MonthlyLoanPayment, Recommendations, InvestmentFromFriendsCount, InvestmentFromFriendsAmount, and Investors to see how they correlate with PercentFunded and whether they can determine a 100% full funding. I will drop the other columns because they are unique to each observation or are repetitions of other columns.
data_copy.dtypes.to_frame().T
data_copy.sample(5)
data_copy.columns
# left subplot: KDE of the full PercentFunded distribution
plt.figure(figsize = (10,4))
plt.subplot(121)
sb.kdeplot(data = data_copy['PercentFunded'], color = 'brown')
plt.title('Distribution of all percentfunded', fontdict = {'fontsize': 18, 'color': 'brown'}, pad = 0.4)
plt.xlabel('Percent Funded', fontdict = {'weight': 'bold'})
plt.ylabel('counts', fontdict = {'weight': 'bold'})
plt.ylim(-4);  # only the lower y-limit is set; the upper limit stays automatic
# Because the proportion of 1.0 is very large, view the distribution of the other
# percent-funded values in another subplot
plt.subplot(122)
not_including_1 = data_copy[data_copy['PercentFunded'] < 1]
bins = np.arange(not_including_1['PercentFunded'].min(), not_including_1['PercentFunded'].max() + 0.025, 0.025)
sb.histplot(data = not_including_1, x = 'PercentFunded', bins = bins, color = 'teal')
plt.title('Distribution of percentfunded excluding 1.0', fontdict = {'fontsize': 18, 'color': 'teal'}, pad = 0.4)
plt.xlim(0.7,0.99,0.25)
plt.xlabel('Percent Funded', fontdict = {'weight': 'bold'})
plt.ylabel('Density', fontdict = {'weight': 'bold'})
plt.subplots_adjust(wspace = 1.0)
# Check the percentage of loans with PercentFunded of 1
# (relies on 1.0 being the most frequent value, hence value_counts().iloc[0])
(data_copy['PercentFunded'].value_counts().iloc[0]/ data_copy['PercentFunded'].shape[0]) * 100
The `PercentFunded` feature is highly skewed with 99.18% of the data being 1.0, thus performing transformation on the feature had no effect on the skewness. But from the observation, 99.18 percent of loans were fully funded.
Of the 0.82% left, most of the loans were 70% funded
def count(data, onx, sides_to_remove, title, fig_width, fig_height, rotation,\
          sort_as_ordinal = None, rotatex = None, subplot = None):
    '''Plot the distribution of a categorical column of data_copy as a count bar
    chart, annotating each bar with its percentage share and raw count.
    data: str, name of the categorical column in data_copy,
    onx: bool, If the categorical data should be on the x-axis,
    sides_to_remove: list, The borders (spines) of the plot to remove,
    title: str, The title of the plot,
    fig_width: int, The width of the plot figure,
    fig_height: int, The height of the plot figure,
    rotation: bool, whether the bar annotations are rotated 45 degrees,
    sort_as_ordinal: bool, If the categories should appear in their own (ordinal)
                     order instead of by frequency,
    rotatex: bool, If True the x tick labels stay horizontal, otherwise rotated 90,
    subplot: int, geometry of subplots'''
    # bar order: by frequency, or by the category's own order when sort_as_ordinal is set
    value = data_copy[data].value_counts().index.to_list() if sort_as_ordinal == None else\
            data_copy[data].value_counts().sort_index().index.to_list()
    fig = plt.figure(figsize = (fig_width, fig_height))
    ax1 = plt.subplot(111) if subplot == None else plt.subplot(subplot)
    ax1.set_title(title, fontdict = {'color':'teal', 'fontsize':18,\
                                     'fontweight': 'bold'}, pad = 10)
    if onx:
        sb.countplot(x = data, data = data_copy, order = value, color = 'teal', ax = ax1)
        # annotate each bar with percentage and count
        for i in range(len(value)):
            # `count` shadows the function name here, but only inside this loop
            count = data_copy[data].value_counts()[value[i]]
            percent = (count/data_copy[data].shape[0]) * 100
            plt.text(x = i, y = count , s = '{:0.2f}% value:({c})'.format(percent, c = count), rotation = 45 if\
                     rotation else 0)
        plt.xlabel(f'{data}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'count of {data}', fontdict = {'weight': 'bold'})
        plt.xticks(rotation = 0 if rotatex == True else 90);
    else:
        # horizontal variant: categories on the y-axis
        sb.countplot(y = data, data = data_copy, order = value, color = 'teal', ax = ax1)
        plt.ylabel(f'{data}', fontdict = {'weight': 'bold'})
        plt.xlabel(f'count of {data}', fontdict = {'weight': 'bold'})
        for i in range(len(value)):
            count = data_copy[data].value_counts()[value[i]]
            percent = (count/data_copy[data].shape[0]) * 100
            plt.text(y = i, x = count , s = '{:0.2f}% value:({c})'.format(percent, c = count), rotation = 45 if\
                     rotation else 0)
    # remove the requested plot borders
    for i in sides_to_remove:
        ax1.spines[i].set_visible(False)
# count plots for each categorical feature; top/right borders removed throughout
sides = ['top', 'right']
count(data = 'ProsperRating (Alpha)', onx = True, sides_to_remove = sides,\
      title ='Count of ProsperRating (Alpha)', fig_width = 10, fig_height = 6, rotation = True)
ProsperRating (Alpha) of C occurs the most and Nc occurs the least
sides = ['top', 'right']
count(data = 'ListingCategory', onx = True, sides_to_remove = sides,\
      title ='Count of ListingCategory', fig_width = 10, fig_height = 4, rotation = True)
Most borrowers had taken a loan for debt Consolidation.
sides = ['top','right']
count(data = 'BorrowerState', onx = True, sides_to_remove = sides, title ='Count of state',\
      fig_width = 15, fig_height = 8, rotation = True)
Most borrowers are from California while the least are from North Dakota and about 3.65% of the loan observations have the states not stated
sides = ['right', 'top']
count(data = 'LoanStatus', onx = False, sides_to_remove = sides, title ='Count of loan status',\
      fig_width = 8, fig_height = 6, rotation = False)
Most of the loan statuses are Current or Completed
sides = ['right', 'top']
count(data = 'EmploymentStatus', onx = False, sides_to_remove = sides, title ='Count of employment status',\
      fig_width = 8, fig_height = 6, rotation = False)
Most borrowers are Employed, with very few retired and not employed
also more borrowers work full time than part-time
sides = ['right', 'top']
count(data = 'Occupation', onx = False, sides_to_remove = sides, title ='Count of Occupation',\
      fig_width = 15, fig_height = 15, rotation = False)
Most borrowers' Occupation is not specified
# quarters shown in ordinal (Q1..Q4) order rather than by frequency
count(data = 'LoanOriginationQuarter', onx = True, sides_to_remove = ['top'], title ='Count of Quarter',\
      fig_width = 8, fig_height = 6, rotation = True, sort_as_ordinal = True, rotatex = True)
The amount of loans originated in each quarter are almost similar, but Q4 had the highest
# two stacked subplots (211/212) on one figure
count('Recommendations', onx = False, sides_to_remove = ['right', 'top'], title = \
      'Distribution of Recommendations', fig_width = 10, fig_height =8, rotation = False, \
      sort_as_ordinal = True, rotatex = None, subplot = 211)
count('InvestmentFromFriendsCount', onx = False, sides_to_remove = ['right', 'top'], title = \
      'Distribution of Investment From Friends Count', fig_width = 10, fig_height =8,
      rotation = False, sort_as_ordinal = True, rotatex = None, subplot = 212)
Most borrowers had no Investment from friends and no recommendations
# 2x2 figure shared by the three boolean-proportion pie charts below
f, ax = plt.subplots(2, 2, figsize = (10,10))
color = ['brown', 'teal']
label = [ 'False','True']
def proportion_pie(data, axis_row, axis_column, title, explode, rotation1, rotate_index, wedgeprops = None,):
    '''Plot the value proportions of a boolean column of data_copy as a pie (or donut)
    chart on the module-level figure `f` / axes grid `ax` (also relies on the global
    `color` and `label` lists defined just above).
    data: str, name of the boolean column in data_copy,
    axis_row: int, row position to plot on,
    axis_column: int, column position to plot on,
    title: str, title of plot,
    explode: list, explode values to pass to pie plot,
    rotation1: bool, If all percentage texts should be rotated,
    rotate_index: int, The index of the text to rotate if rotation1 is false,
    wedgeprops: dict, The width of each proportion, to help form a donut chart
    '''
    # rebind `data` to the sorted value counts (False slice first, then True)
    data = data_copy[data].value_counts().sort_index()
    axes = ax[axis_row][axis_column]
    axes.set_title(title, backgroundcolor = 'black',\
                   fontdict = {'fontweight': 'bold', 'color': 'white'}, pad = 71)
    explode = explode
    if wedgeprops != None:
        # donut variant: wedgeprops controls the ring width
        patches, texts, autotexts = axes.pie(data, colors = color, autopct = '%0.2f%%', startangle = 87,
                                             explode = explode, wedgeprops = wedgeprops,\
                                             textprops = {'color': 'white', 'fontweight' : 'bold'}, radius = 2);
    else:
        patches, texts, autotexts = axes.pie(data, colors = color, autopct = '%0.2f%%', startangle = 87,
                                             explode = explode, textprops = {'color': 'white', 'fontweight' : 'bold'}, radius = 2);
    if rotation1:
        # rotate every percentage label
        [text.set_rotation(90) for text in autotexts]
    else:
        autotexts[rotate_index].set_rotation(75)
    legend_properties = {'weight': 'bold'}
    f.legend(label, loc = 4 , facecolor = 'grey',borderpad = 4, labelspacing = 4,\
             fontsize = 40, prop = legend_properties, labelcolor = 'white',\
             bbox_to_anchor = (0, 0.5));
    plt.subplots_adjust(wspace = 0.9, hspace = 0.9)
    # the 2x2 grid holds only three pies; hide the unused bottom-right axes
    ax[1][1].set_visible(False)
    return axes
# donut: proportion of borrowers who own a home
proportion_pie('IsBorrowerHomeowner', 0, 0, 'proportion of Borrowers that are home owners', [0.07, 0],\
               True,{'width': 1.0})
# pie: proportion of borrowers currently in a group
proportion_pie('CurrentlyInGroup', 0, 1, 'proportion of Borrowers that are in a group', [0, 0.07],\
               False, 1)
# pie: proportion of loans with verifiable income
proportion_pie('IncomeVerifiable', 1, 0, 'proportion of verifiable Income', [0, 0.07],\
               False, 0)
plt.show()
def score_money_counts(fig_width, fig_height, data, title, subplot = None):
    '''Plot the frequency of each distinct value of a numerical column of data_copy
    as a horizontal bar chart, annotated with each value's percentage share.
    fig_width: int, The width of the plot figure,
    fig_height: int, The height of the plot figure,
    data: str, name of the numerical column in data_copy,
    title: str, The title of the plot,
    subplot: int, geometry of subplots'''
    plt.figure(figsize = (fig_width, fig_height))
    ax1 = plt.subplot(111) if subplot == None else plt.subplot(subplot)
    scores = data_copy[data].value_counts().sort_index()
    # stringify the values so they act as categorical y labels
    index = [str(index) for index in scores.index]
    sb.barplot(x = scores.values, y = index, color = 'brown')
    plt.title(title, fontdict = \
              {'color': 'brown', 'fontweight': 'bold'})
    for i in range(len(index)):
        # NOTE(review): the denominator uses the CreditScoreRangeLower column, which has
        # the same row count as any column of data_copy, so the result is correct —
        # data_copy[data].shape[0] would be clearer; confirm before changing.
        percent = (scores.iloc[i]/data_copy['CreditScoreRangeLower'].shape[0]) * 100
        plt.text(x = scores.iloc[i], y = i, s = '{:0.2f}%'.format(percent))
    ax1.spines['right'].set_visible(False)
    plt.ylabel(f'{data}', fontdict = {'weight': 'bold'})
    plt.xlabel(f'count of {data}', fontdict = {'weight': 'bold'})
# distribution of the lower bound of borrowers' credit score ranges
score_money_counts(8, 8, 'CreditScoreRangeLower', 'distribution of lower range of credit scores')
The highest amount of Lower credit score ranges from 640 to 720
# distribution of the upper bound of borrowers' credit score ranges
score_money_counts(8, 8, 'CreditScoreRangeUpper', 'distribution of Upper range of credit scores')
The largest counts of the upper credit score range fall between 659 and 739
def money_hist(fig_width, fig_height, data, number_of_bins, title, more_than_a_subplot = False, \
               use_another_data = False, another_data = None,):
    '''Plot histogram(s) of continuous numerical column(s) on a linear scale.
    fig_width: int, The width of the plot figure,
    fig_height: int, The height of the plot figure,
    data: list, column name(s) — data[0] for the first subplot, data[1] for the second,
    number_of_bins: int, number of bins,
    title: str or list, The title(s) of the plot(s),
    more_than_a_subplot: bool, If a second subplot should be drawn,
    use_another_data: bool, If another DataFrame should be used in the second subplot,
    another_data: DataFrame, The data to use if use_another_data is True'''
    plt.figure(figsize = (fig_width, fig_height))
    ax = plt.subplot(121)
    data_to_use = data_copy
    # bin width = column max / number_of_bins, so bins span min..max inclusive
    bin_number = data_to_use[data[0]].max() / number_of_bins
    bins = np.arange(data_to_use[data[0]].min(), data_to_use[data[0]].max() +bin_number, bin_number)
    sb.histplot(x = data[0], data = data_to_use, ax = ax, bins = bins)
    ax.spines['top'].set_visible(False)
    plt.title(title if isinstance(title, str) else title[0],
              fontdict = {'fontsize': 40})
    plt.xlabel(f'{data[0]}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'frequency of {data[0]}', fontdict = {'weight': 'bold'})
    if more_than_a_subplot:
        # optional second subplot, possibly drawn from a different DataFrame
        ax = plt.subplot(122)
        if use_another_data:
            data_to_use = another_data
        else:
            data_to_use = data_copy
        bin_number = data_to_use[data[1]].max() / number_of_bins
        bins = np.arange(data_to_use[data[1]].min(), data_to_use[data[1]].max() +bin_number, bin_number)
        sb.histplot(x = data[1], data = data_to_use, ax = ax, bins = bins)
        ax.spines['top'].set_visible(False)
        plt.title(title[1], fontdict = {'fontsize': 40})
        plt.xlabel(f'{data[1]}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'frequency of {data[1]}', fontdict = {'weight': 'bold'})
        plt.subplots_adjust(wspace = 0.9)
    # NOTE(review): tight_layout largely overrides the wspace adjustment above — confirm intent
    plt.tight_layout()
def log_money_hist(fig_width, fig_height, data, number_of_bins, title, more_than_a_subplot = False, \
                   use_another_data = False, another_data = None, lim_tick1 = None, lim_tick2 = None):
    '''Plot histogram(s) of continuous numerical column(s) on a logarithmic x scale.
    fig_width: int, The width of the plot figure,
    fig_height: int, The height of the plot figure,
    data: list, column name(s) — data[0] for the first subplot, data[1] for the second,
    number_of_bins: int, number of bins,
    title: str or list, The title(s) of the plot(s),
    more_than_a_subplot: bool, If a second subplot should be drawn,
    use_another_data: bool, If another DataFrame should be used in the second subplot,
    another_data: DataFrame, The data to use if use_another_data is True,
    lim_tick1: list, [xlim, [tick positions, tick labels]] for the first subplot,
    lim_tick2: list, [xlim, [tick positions, tick labels]] for the second subplot'''
    plt.figure(figsize = (fig_width, fig_height))
    def hist_plot(axes, data_index, data_selected, limit = None, tick = None):
        # inner helper: one log-scaled histogram subplot; bins are derived from
        # the log of the column (+1 avoids log(0))
        ax = plt.subplot(axes)
        bin_number = data_copy[data[data_index]].max() / 10 ** np.log(number_of_bins)
        bins = 10 ** np.arange(np.log(data_copy[data[data_index]] + 1).min(), \
                               np.log(data_copy[data[data_index]] + 1).max() +bin_number, bin_number)
        sb.histplot(x = data[data_index], data = data_selected, ax = ax, bins = bins)
        ax.spines['top'].set_visible(False)
        plt.xscale('log')
        plt.title(title if isinstance(title, str) else title[data_index],\
                  fontdict = {'fontsize': 40})
        plt.xlabel(f'{data[data_index]}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'frequency of {data[data_index]}', fontdict = {'weight': 'bold'})
        if limit is not None:
            plt.xlim(limit)
        if tick is not None:
            plt.xticks(tick[0], tick[1])
    if lim_tick1 is None:
        hist_plot(121, 0, data_copy)
    else:
        hist_plot(121, 0, data_copy, limit = lim_tick1[0], tick = lim_tick1[1])
    if more_than_a_subplot:
        if use_another_data: # truth value ambiguous if I use argument to take dataframe in condition
            data_to_use = another_data
        else:
            data_to_use = data_copy
        if lim_tick2 is None:
            hist_plot(122, 1, data_to_use)
        else:
            # bug fix: this branch previously passed data_copy, silently ignoring
            # another_data whenever lim_tick2 was supplied
            hist_plot(122, 1, data_to_use, limit = lim_tick2[0], tick = lim_tick2[1])
# linear-scale histogram of LenderYield
money_hist(fig_width = 20, fig_height = 5, data = ['LenderYield'],\
           number_of_bins = 27, title = 'Distribution of LenderYield',\
           )
The `LenderYield` is a unimodal distribution with the highest close to 0.15.
It seems there are outliers less than 0.05
# how many observations fall below 0.05?
len(data_copy[data_copy['LenderYield'] < 0.05])
Because they are 390 in number that are less than 0,05, I think they are few cases and not outliers
money_hist(fig_width = 20, fig_height = 5, data = ['MonthlyLoanPayment'],\
           number_of_bins = 27, title = 'Distribution of MonthlyLoanPayment',\
           )
MonthlyLoanPayment plotted on a standard scale is skewed.
# log-scale histogram with custom x limits/ticks to spread out the skewed values
log_money_hist(fig_width = 20, fig_height = 8, data = ['MonthlyLoanPayment'], number_of_bins = 70,\
               title = 'Logarithm Distribution of MonthlyLoanPayment', lim_tick1 = [(1e1, 1e4),[[1e1, 1.5e1,5e1, 1e2, 1.75e2,2.5e2,5e2, 1e3, 2.5e3,5e3],\
               ['10', '15','50', '100', '175','250','500', '1000', '2500','5000']]])
On the logarithm scale the mode is between 100 and 175
money_hist(fig_width = 20, fig_height = 5, data = ['TotalCreditLinespast7years'],\
           number_of_bins = 27, title = 'Distribution of Total Credit Lines in past 7 years',\
           )
Total credit lines between 20 and 30 are more but the distribution is skewed a little.
# NOTE(review): tick positions are offset by -1, presumably to counter the +1
# applied inside log_money_hist — confirm
log_money_hist(fig_width = 20, fig_height = 8, data = ['TotalCreditLinespast7years'], number_of_bins = 30,\
               title = 'Logarithm Distribution of TotalCreditLinespast7years', lim_tick1 = \
               [(1e1, 2.5e2), [[5e0-1, 1e1-1, 1.5e1-1, 2e1-1,2.5e1-1, 3.5e1-1, 5e1-1, 1e2-1],\
               ['5','10', '15', '20','25', '35', '50', '100']]])
The logarithmic distribution of total credit lines is roughly bimodal, One mode between 20 and 25 and the other between 35 and 50. There is a steep jump before 15
money_hist(fig_width = 20, fig_height = 5, data =['DebtToIncomeRatio'], number_of_bins = 50,\
           title = 'Distribution of Debt To Income Ratio')
The distribution of Debt to income ratio is skewed and it seems there is an outlier value close to 10
# how many DebtToIncomeRatio values exceed 10?
len(data_copy[data_copy['DebtToIncomeRatio'] > 10])
There are 208 observations higher than 10, which suggests these are genuine values rather than outliers
# log-scale view of DebtToIncomeRatio; the limits/ticks are applied to the current
# axes by the two plt calls that follow
log_money_hist(fig_width = 20, fig_height = 8, data = ['DebtToIncomeRatio'], number_of_bins = 10,\
               title = 'Logarithm Distribution of Debt To Income Ratio')
plt.xlim(0.9e0, 1.5e1);
plt.xticks([1e0, 2e0, 3e0, 4e0, 5e0, 6e0, 7e0, 8e0, 9e0, 1e1, 1.2e1,1.5e1],\
           ['1','2','3','4','5','6','7','8','9','10', '12','15']);
The logarithm distribution of Debt-to-Income ratio is unimodal with even ratios close to 10 being the highest
# side-by-side linear-scale histograms of Investors and the derived
# 'investments not from friends' column
money_hist(fig_width = 50, fig_height = 15, data = ['Investors', 'investments not from friends']\
           ,number_of_bins = 20, title = ['Distribution of Investors', \
           'Distribution of Investments not from friends'],\
           more_than_a_subplot = True)
The distribution from Investors and Investors not friends is highly skewed
# same two columns on a log scale, with custom limits/ticks per subplot
log_money_hist(fig_width = 50, fig_height = 10, data = ['Investors', 'investments not from friends']\
               ,number_of_bins = 50, title = ['Logarithm Distribution of Investors', \
               'Logarithm Distribution of Investments not from friends'],\
               more_than_a_subplot = True, lim_tick1 = [(1e0, 1.5e3),
               [[1e0, 5e0, 1e1, 5e1,1e2, 5e2, 1e3],
               ['1','5','10', '50', '100', '500', '1000']]],
               lim_tick2 = [(0.9e0, 1.5e3),[[1e0, 2e0,5e0, 1e1, 5e1,1e2, 5e2, 1e3],
               ['1', '2','5','10', '50', '100', '500', '1000']]])
The Investors data on the logarithmic scale has a normal distribution with the mode being between 40 and 200.
The Investment not from friends data has the mode between 1 and 1.5 with the other figures forming a roughly normal distribution with a little skew to the left
# subset of loans where friends invested more than $1000
newdata = data_copy[data_copy['InvestmentFromFriendsAmount'] > 1000]
# left: full distribution; right: the same column restricted to amounts > 1000
money_hist(fig_width = 37, fig_height = 15, data = ['InvestmentFromFriendsAmount', 'InvestmentFromFriendsAmount'],\
           number_of_bins = 30, title = ['Distribution of InvestmentFromFriendsAmount',\
           'Distribution of Investment From Friends Amount greater than 1000'],
           use_another_data = True,another_data = newdata, more_than_a_subplot = True)
The data on investments amount from friends ranging between 0 and 1000 occurs far more than other amounts so I made another plot to check distribution of the data part greater than 1000
# log-scale view of the full InvestmentFromFriendsAmount column
log_money_hist(fig_width = 20, fig_height = 10, data = ['InvestmentFromFriendsAmount'],\
               number_of_bins = 150, title = 'Logarithm Distribution of InvestmentFromFriendsAmount',
               lim_tick1 = [(0.9e1, 5e4),[[1e1, 5e1,1e2, 5e2, 1e3, 5e3, 1e4, 5e4, 1e5],
               ['10', '50', '100', '500', '1k', '5k', '10k', '50k', '100k']]],)
The distribution of investments amount from friends on the logarithmic scale is roughly bimodal with the mode covering values between 100 and 150 and the other between 30 and 45
Most of the values, more than 90% in the variable were 1, so the first visualisation was highly skewed and the frequency of other values could not be seen. I tried to transform the scale but because of the very large proportion for PercentFunded value of 1, It could not solve the problem so I divided the data into two:
- Loans that were not fully funded(PercentFunded < 1)
- Loans that were fully funded (PercentFunded = 1)
- The monthly loan payment was skewed and It almost seemed there was an outlier between values 1500 and 2000, so I performed log transformations on the data to understand the distribution better
- The Debt-to-income-ratio was skewed and It almost seemed there was an outlier at value 10, so I performed log transformations on the data to understand the distribution better
- The Investors, Investments not from friends were skewed, so I performed log transformations on the data to understand the distribution better
- The InvestmentFromFriendsAmount was highly skewed, I plotted for monthly payment > 1000 to see the distribution of monthly payment > 1000 and I applied log transformation on the values
I want to check the correlation of all variables with percent funded
I want to also check the relationship of other variables
# Compute the correlation matrix once and reuse it: data_copy.corr() scans every
# numeric column pair over all rows, so recomputing it four times was wasteful
corr_matrix = data_copy.corr()
#check for columns that have high positive correlation with PercentFunded
corr_matrix[corr_matrix['PercentFunded'] > 0.4]['PercentFunded']
#check for columns that have high negative correlation with PercentFunded
corr_matrix[corr_matrix['PercentFunded'] < -0.4]['PercentFunded']
There is no strong correlation between percentfunded and other variables
#check for columns that have high positive correlation
corr = data_copy.corr()  # compute the (expensive) correlation matrix once
a = corr[corr > 0.4]
sb.heatmap(a)
plt.title('Positive correlation between features greater than 0.4', fontdict = {'weight': 'bold'})
There is reasonable correlation between
* Recommendation and InvestorsFromFriendsCount
* InvestorsFromFriendsCount and InvestorsFromFriendsAmount which is expected
#check for columns that have high negative correlation
corr = data_copy.corr()  # compute the (expensive) correlation matrix once
a = corr[corr < -0.4]
sb.heatmap(a)
# Title fix: the mask keeps correlations below -0.4, not 0.4
plt.title('negative correlation between features less than -0.4', fontdict = {'weight': 'bold'})
There is reasonable correlation between LenderYield and
* CreditScoreRangeUpper
* CreditScoreRangeLower
# sb.pairplot creates its own Figure, so the previous plt.figure(figsize=(15,20))
# produced an empty stray figure and plt.title() landed on the wrong one.
# Title the PairGrid's own figure instead.
g = sb.pairplot(data = data_copy, y_vars = 'PercentFunded')
g.fig.suptitle('Pairplot of other features against Percent Funded to confirm',
               fontweight = 'bold')
This plot confirms percentfunded has no correlation with other columns
def boxplots(fig_width, fig_height, cat_data, num_data, data, title,
             multiple_subplots = False, use_another_data = False, another_data = None):
    '''To plot relationship between categorical data and numeric data
    fig_width: int, The width of the plot figure,
    fig_height: int, The height of the plot figure,
    cat_data: list, The elements of the categorical data,
    num_data: list, The elements of the numerical data,
    data: DataFrame, The data to plot from,
    title: list, The title of the plot,
    multiple_subplots: bool, If there should be more than a plot,
    use_another_data: bool, If another data should be used in the second subplot,
    another_data: DataFrame, The data to use if use_another_data is True
    '''
    plt.figure(figsize = (fig_width, fig_height))
    # Top panel: boxplot over the full `data`
    plt.subplot(211)
    sb.boxplot(x = cat_data[0], y = num_data[0], data = data, color = 'teal')
    plt.xticks(rotation = 90)
    plt.title(title[0], fontdict = {'fontweight': 'bold', 'fontsize' : 24})
    if multiple_subplots:
        # Bottom panel: same columns, optionally on a different (e.g. filtered) frame
        plt.subplot(212)
        if use_another_data:
            data_to_use = another_data
        else:
            data_to_use = data
        sb.boxplot(x = cat_data[0], y = num_data[0], data = data_to_use, color = 'teal')
        plt.title(title[1], fontdict = {'fontweight': 'bold', 'fontsize' : 24})
        plt.xticks(rotation = 90);
        plt.subplots_adjust(hspace = 1.4)
    # Axis labels apply to the most recently active subplot
    plt.xlabel(f'{cat_data[0]}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'{num_data[0]}', fontdict = {'weight': 'bold'})
def full_funding(cat_data, title, fig_size = None):
    '''To plot distributions of categorical data in form of counts for fully funded loans
    cat_data: str, The name of the categorical column to count,
    title: str, The title of the plot,
    fig_size: tuple, The size of the figure to plot on
    (reads the module-level data_copy DataFrame)
    '''
    if fig_size is not None:  # `is not None` rather than `!= None` (PEP 8)
        plt.figure(figsize = fig_size)
    # Restrict to loans that received 100% of the requested amount.
    # Renamed the local: it previously shadowed this function's own name.
    funded_loans = data_copy[data_copy['PercentFunded'] == 1]
    sb.countplot(x = cat_data, data = funded_loans, color = 'brown')
    plt.xticks(rotation = 90);
    plt.title(f'count of {title} that had full percent funding',
              fontdict = {'fontweight': 'bold', 'fontsize' : 24});
    plt.xlabel(f'{cat_data}', fontdict = {'weight': 'bold'})
    # Bug fix: cat_data is a string, so cat_data[0] printed only its first character
    plt.ylabel(f'Count of {cat_data} for loans with full funding', fontdict = {'weight': 'bold'})
# Compare PercentFunded across LoanStatus; second panel restricted to loans not fully funded
not_including_1 = data_copy[data_copy['PercentFunded'] < 1]
boxplots(10, 10, ['LoanStatus'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and Loanstatus',
          'Relationship between percentFunded and Loanstatus excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
Loans that are not fully funded with Loan status of past Due(31-60 days) have larger range of PercentFunded than other status, with most having no skew distribution
full_funding('LoanStatus', 'loan status')
# Compare PercentFunded across ProsperRating; second panel restricted to loans not fully funded
not_including_1 = data_copy[data_copy['PercentFunded'] < 1]
boxplots(10, 10, ['ProsperRating (Alpha)'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and ProsperRating',
          'Relationship between percentFunded and ProsperRating excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
Among loans not fully funded, rating C has the highest range of percent funded, HR has the lowest range, and 4 ratings show no skewness
full_funding('ProsperRating (Alpha)', 'ProsperRating')
For fully funded loans, Most Loans had rating 'C' and fewer had 'AA'
# Compare PercentFunded across ListingCategory; second panel restricted to loans not fully funded
not_including_1 = data_copy[data_copy['PercentFunded'] < 1]
boxplots(10, 10, ['ListingCategory'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and ListingCategory',
          'Relationship between percentFunded and ListingCategory excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
For not fully funded loans, Loans for Home Improvement, Business, Debt Consolidation, Household Expenses, Medical/Dental and Other had the largest range of Percent Funded, while Loans for Baby and Adoption, Cosmetic Procedures, Green Loans, and Not Available had the least range, with their values being around 81%, 85%, 71% and 83% respectively
full_funding('ListingCategory', 'Listing Category', (10,4))
For loans fully funded, Loans for Debt Consolidation had the highest frequency.
# Compare PercentFunded across BorrowerState; second panel restricted to loans not fully funded
not_including_1 = data_copy[data_copy['PercentFunded'] < 1]
boxplots(20, 10, ['BorrowerState'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and Borrower State',
          'Relationship between percentFunded and Borrower State excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
For not fully loans, Maryland, North Carolina, New Jersey,New York, Washington had the largest range of Percent funded
full_funding('BorrowerState', 'States', (15, 5))
California had the highest frequency of fully funded loans
# Compare PercentFunded across Occupation (reuses not_including_1 from the previous cell)
boxplots(20, 10, ['Occupation'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and Occupation',
          'Relationship between percentFunded and Occupation excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
For not fully funded loans, Car dealer had a low range of Percent Funded but had high funding, Civil Service, Sales-Commission had high Funding range
full_funding('Occupation', 'Occupation', (15,8))
Although Other and Professional had highest frequency of full funding, Teacher, Executive, COmputer Programming also had relatively high frequency
# Compare PercentFunded across EmploymentStatus
boxplots(20, 10, ['EmploymentStatus'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and Employment Status',
          'Relationship between percentFunded and Employment Status excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
For not fully funded loans, Part-time had fundings of 91%, Full-time and employed had higher range
full_funding('EmploymentStatus', 'EmploymentStatus')
Employed and Full time had relatively high frequency of Fully funded loans
# Compare PercentFunded across IsBorrowerHomeowner, then count fully funded loans per group
boxplots(20, 10, ['IsBorrowerHomeowner'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and IsBorrowerHomeowner',
          'Relationship between percentFunded and IsBorrowerHomeowner excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
full_funding('IsBorrowerHomeowner', 'Homeowner')
Home borrowers not-fully funded loans had a higher range of percent funding and higher frequency of full funding
# Compare PercentFunded across CurrentlyInGroup
boxplots(20, 10, ['CurrentlyInGroup'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and CurrentlyInGroup',
          'Relationship between percentFunded and CurrentlyInGroup excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
Borrowers that did not get full-funding in group had a lower percent funding than those not in groups
full_funding('CurrentlyInGroup', 'borrowers in groups')
Most borrowers not in groups had full funding, but this could be because of the nature of data being analysed, from the exploration of the variable False had alrger proportion than True
# Compare PercentFunded across LoanOriginationQuarter, then count fully funded loans per quarter
boxplots(20, 10, ['LoanOriginationQuarter'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and LoanOriginationQuarter',
          'Relationship between percentFunded and LoanOriginationQuarter excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
full_funding('LoanOriginationQuarter', 'Quarter that had loans')
This is similar to the distribution of the Quarter variable, so I think it has no effect on whether a loan will be fully funded or not
# Compare PercentFunded across IncomeVerifiable, then count fully funded loans per group
boxplots(20, 10, ['IncomeVerifiable'], ['PercentFunded'], data_copy,
         ['Relationship between percentFunded and IncomeVerifiable',
          'Relationship between percentFunded and IncomeVerifiable excluding 1'],
         multiple_subplots = True,
         use_another_data = True, another_data = not_including_1)
full_funding('IncomeVerifiable', 'verified income')
This is similar to the distribution of theIncome Verifiable Variable, so I Think they have no effect on if it will be fully funded or not, but Most fully funded borrowers had Income Verified
def heatmap(x_data, y_data, number_of_bins, fig_size, more_subplot = False, x_data2 = None):
    '''To plot relationship between numeric data and numeric data in terms of density of their frequency
    (reads the module-level not_including_1 DataFrame of not-fully-funded loans)
    x_data: str, The data to be on the x-axis,
    y_data: str, The data to be on the y-axis,
    number_of_bins: int, Number of bins,
    fig_size: tuple, The size of the figure to plot on,
    more_subplot: bool, If there should be another subplot,
    x_data2: str, The data to be on the x-axis, if more_subplot is True
    '''
    plt.figure(figsize = fig_size)
    plt.subplot(211) if more_subplot else plt.subplot(111)
    # NOTE(review): despite the names, no_of_bins_x / no_of_bins_y are bin *widths*
    # (column max divided by the requested bin count), used as the np.arange step
    no_of_bins_x = not_including_1[x_data].max()/ number_of_bins
    no_of_bins_y = not_including_1[y_data].max()/ number_of_bins
    bins_x = np.arange(not_including_1[x_data].min(),
                       not_including_1[x_data].max() + no_of_bins_x, no_of_bins_x)
    bins_y = np.arange(not_including_1[y_data].min(),
                       not_including_1[y_data].max() + no_of_bins_y, no_of_bins_y)
    plt.hist2d(x = x_data, y = y_data, data = not_including_1,
               bins = [bins_x, bins_y], cmap = 'viridis_r', )
    plt.title(f'Relationship between percentFunded and {x_data}')
    plt.xlabel(f'{x_data}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'{y_data}', fontdict = {'weight': 'bold'})
    if more_subplot:
        plt.subplot(212)
        # Recompute the x bins for the second column; bins_y is reused from above
        no_of_bins_x = not_including_1[x_data2].max()/ number_of_bins
        bins_x = np.arange(not_including_1[x_data2].min(),
                           not_including_1[x_data2].max() + no_of_bins_x, no_of_bins_x)
        plt.hist2d(x = x_data2, y = y_data, data = not_including_1,
                   bins = [bins_x, bins_y], cmap = 'viridis_r', )
        plt.title(f'Relationship between percentFunded and {x_data2}')
        plt.xlabel(f'{x_data2}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'{y_data}', fontdict = {'weight': 'bold'})
        plt.subplots_adjust(hspace = 0.9)
    # Colorbar attaches to the most recently drawn hist2d image
    plt.colorbar();
def contvcont(x_data, y_data, number_of_bins, fig_size, more_subplot = False, x_data2 = None):
    '''To plot relationship between numeric data and numeric data
    (reads the module-level not_including_1 DataFrame of not-fully-funded loans)
    x_data: str, The data to be on the x-axis,
    y_data: str, The data to be on the y-axis,
    number_of_bins: int, Unused; kept so the signature mirrors heatmap(),
    fig_size: tuple, The size of the figure to plot on,
    more_subplot: bool, If there should be another subplot,
    x_data2: str, The data to be on the x-axis, if more_subplot is True
    '''
    plt.figure(figsize = fig_size)
    plt.subplot(211) if more_subplot else plt.subplot(111)
    # regplot needs no pre-computed bins, so the unused bin arithmetic was removed
    sb.regplot(x = x_data, y = y_data, data = not_including_1,)
    plt.title(f'Relationship between percentFunded and {x_data}')
    plt.xlabel(f'{x_data}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'{y_data}', fontdict = {'weight': 'bold'})
    if more_subplot:
        plt.subplot(212)
        # Bug fix: the second panel plotted and labelled x_data again even though
        # its title announced x_data2; plot x_data2 as intended
        sb.regplot(x = x_data2, y = y_data, data = not_including_1)
        plt.title(f'Relationship between percentFunded and {x_data2}')
        plt.xlabel(f'{x_data2}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'{y_data}', fontdict = {'weight': 'bold'})
        plt.subplots_adjust(hspace = 0.9)
def full_fund_hist(data, number_of_bins, fig_size, more_subplot = False, data2 = None, log_plot = False):
    '''To plot frequency of numeric data and numeric data for fully funded loans
    (reads the module-level data_copy DataFrame)
    data: str, The numeric data to plot,
    number_of_bins: int, Number of bins,
    fig_size: tuple, The size of the figure to plot on,
    more_subplot: bool, If there should be another subplot,
    data2: str, The data to be on the x-axis, if more_subplot is True,
    log_plot: bool, if the second subplot should be a logarithmic plot
    '''
    full_funding = data_copy[data_copy['PercentFunded'] == 1]
    plt.figure(figsize = fig_size)
    plt.subplot(121) if more_subplot else plt.subplot(111)
    # no_of_bins is really the bin *width* (column max / requested bin count)
    no_of_bins = full_funding[data].max()/ number_of_bins
    bins = np.arange(full_funding[data].min(),
                     full_funding[data].max() + no_of_bins, no_of_bins)
    sb.histplot(x = data, data = full_funding, bins = bins, stat = 'density')
    plt.title(f'{data} with full percentFunded')
    plt.xlabel(f'{data}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'frequency of {data} for fully funded loans', fontdict = {'weight': 'bold'})
    if more_subplot:
        plt.subplot(122)
        if log_plot:
            # NOTE(review): these bin edges are computed in log(x+1) space but the
            # axis is set to a log *scale* on the raw values; the edges and the
            # scale may not line up — confirm this renders as intended
            no_of_bins = np.log(full_funding[data] + 1).max()/ number_of_bins
            bins = np.arange(np.log(full_funding[data] + 1).min(),
                             np.log(full_funding[data] + 1).max() + no_of_bins, no_of_bins)
            plt.xscale('log')
            plt.xlim(0.9e0, 2e1)
            plt.xticks([0.9e0, 2e0, 4e0, 6e0, 8e0, 1e1], ['0', '2', '4', '6', '8', '10'])
        else:
            no_of_bins = full_funding[data2].max()/ number_of_bins
            bins = np.arange(full_funding[data2].min(),
                             full_funding[data2].max() + no_of_bins, no_of_bins)
        # In log mode re-plot `data`; otherwise plot the second column `data2`
        sb.histplot(x = data if log_plot else data2
                    , data = full_funding, bins = bins, stat = 'density')
        # NOTE(review): labels always name `data`, even when data2 is plotted — confirm
        plt.xlabel(f'{data}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'frequency of {data} for fully funded loans', fontdict = {'weight': 'bold'})
        if log_plot:
            plt.title(f'logarithm plot of {data} with full percentFunded')
        else:
            plt.title(f'logarithm plot of {data2} with full percentFunded')
        plt.subplots_adjust(wspace = 0.9)
contvcont('CreditScoreRangeLower', 'PercentFunded', 50, (10,10),more_subplot = True, x_data2 = 'CreditScoreRangeUpper')
For fully Funded loans, The credit SCore range have no correlation with Percent Funded
full_fund_hist('CreditScoreRangeLower', 50, (10,10),more_subplot = True, data2 = 'CreditScoreRangeUpper')
Most Fully funded loans had credit score of 700
heatmap('TotalCreditLinespast7years', 'PercentFunded', 50, (15, 6))
Most total credit lines were between 10 and 40, but most credit lines between 20 and 40 had 70% funding
full_fund_hist('TotalCreditLinespast7years', 50, (10,6))
Credit Lines of close to 18 and close to 28 had highes frequency of full funding
full_fund_hist('DebtToIncomeRatio', 50, (10,6), more_subplot = True, log_plot = True)
Debt-to-Income ratio of 10 had the highest frequency of fully funded loans
# NOTE(review): x_data2 here is CreditScoreRangeUpper, which looks copy-pasted from
# the credit-score cell above rather than a second debt-related column — confirm
contvcont('DebtToIncomeRatio', 'PercentFunded', 50, (10,8),\
          more_subplot = True, x_data2 = 'CreditScoreRangeUpper')
def correlation(fig_size, data, multiple_subplot = False):
    '''To plot correlation between numeric data and PercentFunded
    fig_size: tuple, The size of the figure to plot on,
    data: str, The numeric column to plot against PercentFunded,
    multiple_subplot: bool, If there should be another subplot for not fully funded loans
    (reads the module-level data_copy and not_including_1 DataFrames)
    '''
    plt.figure(figsize = fig_size)
    # Bug fix: the single-plot branch called plt.subplots(111), which creates a
    # brand-new figure with 111 rows of axes; plt.subplot(111) selects an axes
    # on the figure created above. (The buggy branch was never hit because every
    # visible call passes multiple_subplot=True.)
    plt.subplot(121) if multiple_subplot else plt.subplot(111)
    sb.regplot(x = data, y = 'PercentFunded', data = data_copy)
    plt.title(f'Relationship between PercentFunded \nand {data}')
    plt.xlabel(f'{data}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'PercentFunded', fontdict = {'weight': 'bold'})
    if multiple_subplot:
        # Second panel: same regression restricted to loans not fully funded
        plt.subplot(122)
        sb.regplot(x = data, y = 'PercentFunded', data = not_including_1)
        plt.title(f'Relationship between PercentFunded \nand {data} of loans'
                  'that were not fully funded')
        plt.subplots_adjust(wspace = 0.9)
        plt.xlabel(f'{data}', fontdict = {'weight': 'bold'})
        plt.ylabel(f'PercentFunded', fontdict = {'weight': 'bold'})
def fully_funded_for_numeric(fig_size, column, number_of_bins = 50):
    '''To plot frequency of numeric data for fully funded loans
    fig_size: tuple, The size of the figure to plot on,
    column: str, The numeric data to plot,
    number_of_bins: int, Number of bins (default 50, which was previously hard-coded)
    (reads the module-level data_copy DataFrame)
    '''
    plt.figure(figsize = fig_size)
    plt.subplot(111)
    data_to_use = data_copy[data_copy['PercentFunded'] == 1]
    # Bin width: column max divided by the requested bin count
    bin_width = data_to_use[column].max()/ number_of_bins
    bins = np.arange(data_to_use[column].min(),
                     data_to_use[column].max() + bin_width, bin_width)
    sb.histplot(x = column, data = data_to_use, bins = bins)
    plt.xlabel(f'{column}', fontdict = {'weight': 'bold'})
    plt.ylabel(f'Frequency of {column} for fully funded loans', fontdict = {'weight': 'bold'})
fully_funded_for_numeric((10,4), 'DebtToIncomeRatio')
The DebttoIncomeRatio has a very very low correlation with PercentFunded
correlation((10, 8), 'MonthlyLoanPayment', multiple_subplot = True)
For non-full-funded loans, Monthly payment had a reasonable high positive correlation with percentFunded
correlation((10, 8), 'Recommendations', multiple_subplot = True)
For non-fully funded loans, Recommendations had a little negative correlation with percentFunded
def recommendations(column):
    '''To plot distribution of numeric data on recommendations for loans not fully funded
    column: str, The numeric column plotted on the y-axis of each boxplot
    (reads the module-level not_including_1 DataFrame)
    '''
    # One subplot per distinct Recommendations value; the annotation below records
    # that only 0 and 1 occur for loans without full funding, matching the 1x2 grid
    fig, ax = plt.subplots(1,2)
    reco = not_including_1['Recommendations'].unique()
    # NOTE: the loop variable `ax` shadows the axes array created above
    for i, ax in zip(reco, ax.ravel()):
        no_recommendation = not_including_1[not_including_1['Recommendations'] == i]
        sb.boxplot(x = 'Recommendations', y = column, data = no_recommendation, ax = ax)
        plt.text(x = 1, y = 0.8, s = 'The only number of recommendation of loans that do not have '
                 'full funding are 0 and 1')
        ax.set_xlabel('Recommendation', fontdict = {'weight': 'bold'})
        ax.set_ylabel(f'PercentFunded', fontdict = {'weight': 'bold'})
    # Title lands on the current (last) axes
    plt.title(f'Relationship between Recomendations and {column} of not fully funded loans',
              fontdict = {'weight':'bold'})
    plt.subplots_adjust(wspace = 0.9)
recommendations('PercentFunded')
For Recommendation of 1 most Funding were between 72% and 75%
recommendations('InvestmentFromFriendsCount')
This suggests that recommendations might have a relationship with friends that invest, because recommendations of 1 have a range of percent funded, though the distribution is negatively skewed
# Proportion of each Recommendations value among fully funded loans.
# full_funded is reused later by full_funds(), so keep the module-level name.
full_funded = data_copy[data_copy['PercentFunded'] == 1]
ax = plt.subplot(111)
# Hoist value_counts(): it was recomputed on every loop iteration below
counts = full_funded['Recommendations'].value_counts()
total = full_funded['Recommendations'].shape[0]
prop = counts / total
label = [str(x) for x in counts.index.to_list()]
sb.barplot(x = prop, y = label, ax = ax)
# Annotate each bar with its percentage and raw count
for i in range(len(label)):
    p = counts.iloc[i] / total
    percent = p * 100
    value = counts.iloc[i]
    plt.text(y = i, x = p, s = '{:0.3f}% ,{}counts'.format(percent, value))
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
# Bug fix: fontdict keys are case-sensitive text properties; 'Weight' is not a
# valid property name, 'weight' is
plt.title('Proportion of Recommendations for fully funded loans', fontdict = {'weight': 'bold'},
          pad = 7)
plt.xlabel('proportion', fontdict = {'weight':'bold'})
plt.ylabel('Recommendations', fontdict = {'weight':'bold'});
From distribution of Recommendtaions, most loans had no recommendation, therfore It has no effect on PercentFunding
correlation((10, 8), 'InvestmentFromFriendsAmount', multiple_subplot = True)
InvestmentsfromfriendsAMount has very little correlation with PercentFunded, for not fully funded loans
def relation(x_data, y_data, data):
    '''Draw a regression plot showing how one numeric column relates to another
    x_data: str, The numeric data to plot on x-axis,
    y_data: str, The numeric data to plot on y-axis,
    data: DataFrame, The data to plot from
    '''
    emphasis = {'weight': 'bold'}
    sb.regplot(x = x_data, y = y_data, data = data);
    plt.xlabel(f'{x_data}', fontdict = emphasis)
    plt.ylabel(f'{y_data}', fontdict = emphasis);
    plt.title(f'relationship between {x_data} and {y_data}', fontdict = emphasis, pad = 7)
relation('InvestmentFromFriendsCount', 'Investors', data_copy)
High correlattion (expected)
relation('Recommendations', 'InvestmentFromFriendsCount', data_copy)
High correlation between Recommendations and friends that invest
relation('LenderYield', 'PercentFunded', not_including_1)
Little to no correlation between LendeYield and PercentFunded
def cat_rel(column, hue):
    '''To plot counts of one categorical variable split by another
    column: str, The categorical column placed on the x-axis,
    hue: str, The categorical column used to colour/split the counts
    (reads the module-level data_copy DataFrame)
    '''
    sb.countplot(x = column, hue = hue, data = data_copy)
    plt.xticks(rotation = 90)
    plt.title(f'Proportion of {column} and {hue}', fontdict = {'weight':'bold'}, pad = 7)
    plt.xlabel(f'{column}', fontdict = {'weight':'bold'})
    plt.ylabel(f'counts of {column} and {hue}', fontdict = {'weight':'bold'});
cat_rel('EmploymentStatus', 'IncomeVerifiable',)
Most Employed and full-time borrowers had ttheir Income Verifiable, while most self-employed did not have their incomes verifiable
cat_rel('ProsperRating (Alpha)', 'CurrentlyInGroup',)
across all ratings, most borrowers were not in groups
The correlation between the variable and the other variables was very low almost 0.
data_copy.columns
def vari(col = None, numerical = False, facet = False, minimum = None, pad = None):
    '''To plot relationship between LenderYield, PercentFunded and one categorical variable or
    numerical variable (reads the module-level not_including_1 DataFrame)
    col: str, the third variable to encode (hue, facet column, or colour value),
    numerical: bool, treat col as a numerical column and colour a scatter by it,
    facet: bool, if it should be a facet plot (one panel per level of col),
    minimum: int, minimum value for the colorbar (numerical mode only),
    pad: int, The pad for the title (facet mode only)
    '''
    plt.figure(figsize = (10,5))
    if numerical:
        # Colour-encode the numeric column through a colormap, with a colorbar key
        plt.scatter(data = not_including_1, x = 'LenderYield', y = 'PercentFunded', c = col, \
                    cmap = 'Accent', vmin = minimum)
        plt.colorbar(label = col)
    elif facet:
        # One regression panel per level of col
        g = sb.FacetGrid(data = not_including_1, col = col, col_wrap =3)
        g.map(sb.regplot, 'LenderYield', 'PercentFunded')
        plt.title(f'Relationship between LenderYield, PercentFunded and {col}',
                  fontdict = {'weight': 'bold'}, pad = pad)
    else:
        # Categorical col: hue-encoded scatter
        sb.scatterplot(x = 'LenderYield', y = 'PercentFunded', data = not_including_1,\
                       hue = col, s = 70);
    if not facet:
        plt.title(f'Relationship between LenderYield, PercentFunded and {col}',
                  fontdict = {'weight': 'bold'}, pad = 10)
    plt.xlabel('LenderYield', fontdict = {'weight':'bold'})
    plt.ylabel('PercentFunded', fontdict = {'weight':'bold'});
    # NOTE(review): legend is requested unconditionally; in the numerical branch
    # there are no labelled artists, so matplotlib may emit a warning — confirm intended
    plt.legend(loc = 'right', bbox_to_anchor = (1.5,1))
vari(col = 'EmploymentStatus')
Loans of Most borrowers that are not employed, that have employment status of other have more lender yield, Part-time had high LenderYield and Percent Funded of about 87%, Self-employed has high LenderYield
vari('LoanStatus', facet = True, pad = 650)
even across the LoanStatus feature, the correlation between PercentFunded and lender yield is neutral or almost neutral.
vari('ProsperRating (Alpha)')
All ratings had the same range of PercentFunded except E that had the least PercentFUnded to be above 70%, From AA to HR there's decrease in LenderYield
vari('CurrentlyInGroup')
Most loans of borrowers in groups had lower Lender Yield, but for not fully funded loans their percentfunded is between 72% and 97%.
vari(col = 'LoanOriginationQuarter', facet= True, pad = 250)
even across the LoanOriginationQuarter feature, the correlation between PercentFunded and lender yiels is neutral or almost neutral.
vari(col = 'IncomeVerifiable')
Most borrowers with Income not verifiabl had high LenderYield
vari(numerical= True, col= 'TotalCreditLinespast7years')
Most loans with totalcredit between 65 and 75 have low Funds percent less than 75%, loans with totalcredit between 35 and 45 have high Funds percent above than 85%
vari(numerical= True, col= 'CreditScoreRangeLower')
Most loans of Credit score between 625 to 725 have higher yields,Most loans of Credit score from730 above had low lender yield
vari(numerical= True, col = 'CreditScoreRangeUpper', minimum = 19)
Most loans of Credit score from 750 and above have lower yields,Most loans of Credit score between 550 and 750 had high lender yield
vari(numerical= True, col = 'MonthlyLoanPayment')
most loans with Monthly Loan payments between 650 and 800 have high PercentFunded and lower LenderYield
most loans with Monthly Loan payments between 300 and 500 have lower
most loans with Monthly Loan payments between 500 to 650 have high PercentFunded and lower LenderYield
def show(xs, y):
    '''To plot differnt subplots with different sizes
    xs: list, List of categorical variables to plot,
    y: str, numerical variable to plot against
    '''
    fig = plt.figure(figsize = (10, 5 * len(xs)//2))
    gs = GridSpec(3, 2, figure = fig)
    # Register five axes on the figure (a 2x2 grid plus one full-width bottom row);
    # full_funds() fills them in order via fig.axes
    ax1 = fig.add_subplot(gs[0,0])
    ax2 = fig.add_subplot(gs[0,1])
    ax3 = fig.add_subplot(gs[1,0])
    ax4 = fig.add_subplot(gs[1,1])
    ax5 = fig.add_subplot(gs[2,0:])
    full_funds(fig, xs, y)
    # Bug fix: Figure.suptitle takes text properties as keyword arguments, not a
    # fontdict mapping; pass fontweight directly
    fig.suptitle(f'Relationship between Categorical variables and {y} for fully funded loans',
                 fontweight = 'bold')
    plt.show()
def full_funds(fig, xs, y):
    '''To plot relationship between numerical variable and categorical variables for fully funded loans
    fig: matplotlib.pyplot.figure, The figure whose pre-created axes are filled in order,
    xs: list, List of categorical variables to plot (one per axes),
    y: str, numerical variable to plot against
    (reads the module-level full_funded DataFrame)
    '''
    for i, ax in enumerate(fig.axes):
        if i == 4:
            # Last (full-width) axes puts categories on x, so rotate their labels
            sb.barplot(x = xs[i], y = y, data = full_funded, color = 'grey', ax = ax, ci = None)
            # Bug fix: rotate ticks on this axes explicitly; plt.xticks() only
            # happened to work because the last-added axes was also "current"
            ax.tick_params(axis = 'x', rotation = 90)
        else:
            sb.barplot(y = xs[i], x = y, data = full_funded, color = 'grey', ax = ax, ci = None)
    plt.subplots_adjust(wspace = 0.9, hspace = 0.4)
# Categorical features compared against each numeric variable in the show() calls below
cat = ['ProsperRating (Alpha)', 'LoanStatus', 'ListingCategory','LoanOriginationQuarter', 'BorrowerState']
show(xs = cat, y = 'LenderYield')
show(xs = cat, y = 'MonthlyLoanPayment')
show(xs = cat, y = 'TotalCreditLinespast7years')
show(xs = cat, y = 'CreditScoreRangeUpper')
show(xs = cat, y = 'CreditScoreRangeLower')
Higher credit score had lower lender yields and lower credit score had higher lender yields
I performed data wrangling on the data to see how tidy and clean it was. The data was almost tidy, as each row was an observation and each column was a variable; there were also no duplicates, but there were two columns (CreditGrade and ProsperRating (Alpha)) that had to be joined.
It was not clean however, as the datatypes were not correct and there were many missing values. I picked the variable of interest along with the variables that would be necessary in the analysis. I cleaned the data while making sure not to lose too much data. I made an exploratory analysis of the distribution of all the features I picked, and checked the variation of two variables against each other, and even three variables.
My variable of interest
PercentFunded did not have a strong correlation with the other variables. It might be better if more diverse data is collected, as most of the observations had a PercentFunded value of 1.0